The Future Is Now

Table of Contents


Packages


install.packages(c("xgboost", "text2vec", "keras", "lime"))

library(dplyr)

library(ggplot2)

library(httr)

library(keras)

library(lime)

library(magick)

library(parallel)

library(rsample)

library(rvest)

library(text2vec)

library(xgboost)

# DO NOT RUN:

install_keras(tensorflow = "gpu")

An Unvisited Past

Before we really get into what might be coming in the (near) future, it would be useful to get a glimpse of the what was helpful in the past.

Many of the things that we have done together have been computationally intensive. Sometimes there is nothing to be done and you just have to wait. Many problems, though, present well for using parallel computing. Instead of running a job on one core of your machine, you can choose to split the job up over how many cores your machine has (usually leaving 1 open for safety reasons).


pictureLinks = read_html("https://mendoza.nd.edu/mendoza-directory?employeetype=All") %>% 
  html_nodes(".directory-image .profileimage") %>% 
  html_attr("src")

pictureLinks = pictureLinks[!duplicated(pictureLinks)]

pictureLinks = gsub("\\?.*", "", pictureLinks)

personNumbers = sprintf("%03d", 1:length(pictureLinks))

dir.create("D:/projects/courses/unstructured/data/kerasImages/monsters")

dir.create("D:/projects/courses/unstructured/data/kerasImages/people")

cl = makeCluster(detectCores() - 1)

clusterExport(cl, c("pictureLinks", "personNumbers"))

clusterEvalQ(cl, library(httr))

clusterEvalQ(cl, library(rvest))

parSapply(cl, 1:length(pictureLinks), function(x) {
  GET(pictureLinks[x], 
      write_disk(paste("D:/projects/courses/unstructured/data/kerasImages/people/person", 
                       personNumbers[x], ".jpg", sep = "")))
})

stopCluster(cl)


### Monsters ###

monsterLinks = read_html("http://www.iwozhere.com/SRD/Gallery.html") %>% 
  html_nodes(".gallerynamelink") %>% 
  html_attr("href")

monsters = grepl("(^monsters|^images).*jpg", monsterLinks)

monsterLinks = monsterLinks[which(monsters == TRUE)]

monsterLinks = sample(monsterLinks, 325, replace = FALSE)

monsterLinks = paste("http://www.iwozhere.com/SRD/", monsterLinks, sep = "")

monsterNumbers = sprintf("%03d", 1:length(monsterLinks))

cl = parallel::makeCluster(7)

clusterExport(cl, c("monsterLinks", "monsterNumbers"))

clusterEvalQ(cl, library(httr))

clusterEvalQ(cl, library(rvest))

parSapply(cl, 1:length(monsterLinks), function(x) {
  GET(monsterLinks[x], 
      write_disk(paste("D:/projects/courses/unstructured/data/kerasImages/monsters/monster", 
                       monsterNumbers[x], ".jpg", sep = "")))
})

stopCluster(cl)

Now that we have our data (and much faster than what we would have normally), we can split our data up into training, validation, and testing folders.


allPeople = list.files(path = "D:/projects/courses/unstructured/data/kerasImages/people/", 
                       full.names = TRUE)

allMonsters = list.files(path = "D:/projects/courses/unstructured/data/kerasImages/monsters/", 
                         full.names = TRUE)

# Split people into train/validate/test

allPeople = list.files(path = "D:/projects/courses/unstructured/data/kerasImages/people/", 
                       full.names = FALSE)

allMonsters = list.files(path = "D:/projects/courses/unstructured/data/kerasImages/monsters/", 
                         full.names = FALSE)

# Split people into train/validate/test

set.seed(1001)

peopleTrain = sample(1:length(allPeople), round(length(allPeople) * .5), replace = FALSE)

peopleTrainImages = allPeople[peopleTrain]

remainingPeople = allPeople[-peopleTrain]

peopleValidate = sample(1:length(remainingPeople), round(length(remainingPeople) * .5), replace = FALSE)

peopleValidateImages = remainingPeople[peopleValidate]

peopleTestImages = remainingPeople[-peopleValidate]

# Split monsters into train/validate/test

monstersTrain = sample(1:length(allMonsters), round(length(allMonsters) * .5), replace = FALSE)

monstersTrainImages = allMonsters[monstersTrain]

remainingMonsters = allMonsters[-monstersTrain]

monstersValidate = sample(1:length(remainingMonsters), round(length(remainingMonsters) * .5), replace = FALSE)

monstersValidateImages = remainingMonsters[monstersValidate]

monstersTestImages = remainingMonsters[-monstersValidate]

# Create directories and locate the images appropriately.

dir.create("D:/projects/courses/unstructured/data/kerasImages/train")

dir.create("D:/projects/courses/unstructured/data/kerasImages/validate")

dir.create("D:/projects/courses/unstructured/data/kerasImages/test")

dir.create("D:/projects/courses/unstructured/data/kerasImages/train/people")

dir.create("D:/projects/courses/unstructured/data/kerasImages/validate/people")

dir.create("D:/projects/courses/unstructured/data/kerasImages/test/people")

dir.create("D:/projects/courses/unstructured/data/kerasImages/train/monsters")

dir.create("D:/projects/courses/unstructured/data/kerasImages/validate/monsters")

dir.create("D:/projects/courses/unstructured/data/kerasImages/test/monsters")

file.copy(file.path("D:/projects/courses/unstructured/data/kerasImages/monsters/", monstersTrainImages),
          file.path("D:/projects/courses/unstructured/data/kerasImages/train/monsters")) 

file.copy(file.path("D:/projects/courses/unstructured/data/kerasImages/monsters/", monstersValidateImages),
          file.path("D:/projects/courses/unstructured/data/kerasImages/validate/monsters")) 

file.copy(file.path("D:/projects/courses/unstructured/data/kerasImages/monsters/", monstersTestImages),
          file.path("D:/projects/courses/unstructured/data/kerasImages/test/monsters")) 

file.copy(file.path("D:/projects/courses/unstructured/data/kerasImages/people/", peopleTrainImages),
          file.path("D:/projects/courses/unstructured/data/kerasImages/train/people")) 

file.copy(file.path("D:/projects/courses/unstructured/data/kerasImages/people/", peopleValidateImages),
          file.path("D:/projects/courses/unstructured/data/kerasImages/validate/people")) 

file.copy(file.path("D:/projects/courses/unstructured/data/kerasImages/people/", peopleValidateImages),
          file.path("D:/projects/courses/unstructured/data/kerasImages/test/people")) 

We are just going to hold off on using this for a while, but it will be important in a bit.

TensorFlow

Here is a little known fact – R is slightly slow. Why? Mostly because of its low-level interpreter. For most analyses, R is relying on a Basic Linear Algebra System (BLAS). The default BLAS is used for its stability and not at all for its speed. Any Linux-based users can switch it out easily, but Windows is a massive pain. Great to know, but why is it important. Once you get your head around the prospect that languages have various numeric computation engines living beneath them, it starts to open door to increasing speed.

TensorFlow is nothing but a computation engine and there are many such engines out there now: Torch, Caffe, H_2O, and Theano, to name a few. TensorFlow, however, seems to be doing a pretty good job at being popular. In addition to its popularity, it can also play nicely in many different languages. It does not hurt that it was developed and is still used by Google.

Tensor flow relies on…tensors. These are just multidimensional arrays and can be thought of mapping data up one dimension higher.

Graphics Processing Unit

While being able to take advantage of our CPU is useful, the GPU is the king of speed. Most of the fancy stuff that we are going to be doing will run on a CPU, but it can take some time to finish.

This is the computational embodiment of ā€œwould you rather fight a horse or 100 geese?ā€. If we think about what GPUs are asked to do – transorm data into different spaces – we can see where they might be useful for statistics.

Keras

Keras has been built as a front-end for different lower-level libraries (like TensorFlow). It just serves to make TensorFlow (or a few other backend) easier to use.

Installation Instructions

Deep Convolutional Neural Networks

While we learned a bit about neural networks, the simple perceptron model is somewhat old news. Given the increase in computational power, new neural networks have become the go to model for speech and image classification.

These models are particularly well-suited for images, but can be used on just about regression or classification problem that you might encounter.

From a conceptual standpoint, they are not world away from what we previously saw. We have an input that gets passed through layers, weights and computations are applied through the layers, output is achieved and the system starts again. The biggest difference between these deep nets and what we saw a few weeks ago is in the complexity. We can have many hidden layers and the layers can be incredibly deep.

Here is the same diagram, but with a clearer depiction.

Here we are breaking image features down that get passed into the convolutional layer (typically with a Rectified Linear Unit activation).

The softmax function we see at the end is just providing the mapping of the output to class probability distribution.

The size of the boxes in both figures help to illustrate an important point: the first layers extract very high-level features (just like the features we saw last time). They are picking up on things like lines, colors, and course shapes. As we pass into the next layers, the feature map becomes smaller – here, we are picking up on small details within the images. If we use a pretrained model, we can freeze those top layers; this will help to keep the broad generalizations from the models.

Regression

Classification is always a fun task, but we should note that anything we would normally do on the modeling side can be done in Keras.

Let’s go back to that attrition data that we played with when we were using Naive Bayes. Data prep is always the trickiest part of these models:


splits = initial_split(attrition, .6)

trainingDataWhole = training(splits)

trainingDataWhole = trainingDataWhole %>% 
  select_if(is.numeric)

trainingDataScaled = scale(trainingDataWhole)

colMeansTraining = attr(trainingDataScaled, "scaled:center")

colSDTraining = attr(trainingDataScaled, "scaled:scale")

trainingDataScaled = as.data.frame(trainingDataScaled)

testingDataWhole = testing(splits)

testingDataScaled = testingDataWhole %>% 
  select_if(is.numeric) 

testingDataScaled = as.data.frame(scale(testingDataScaled, 
                                        center = colMeansTraining, scale = colSDTraining))

yVar = which(names(attrition) == "MonthlyIncome")

trainingY = as.vector(trainingDataScaled$MonthlyIncome)

trainingData = data.matrix(trainingDataScaled[, -c(yVar)])

testingY = as.vector(testingDataScaled$MonthlyIncome)

testingData = data.matrix(testingDataScaled[, -c(yVar)])

model = keras_model_sequential() %>%
    layer_dense(units = 64, activation = "relu",
                input_shape = dim(trainingData)[2]) %>%
    layer_dense(units = 64, activation = "relu") %>%
    layer_dense(units = 1)
  
model %>% 
  compile(loss = "mse", optimizer = optimizer_rmsprop(),
          metrics = list("mean_absolute_error"))

model %>% summary()

history = model %>% 
  fit(trainingData, trainingY,
  epochs = 50, validation_split = 0.2,
  verbose = 2)


plot(history, metrics = "mean_absolute_error", smooth = FALSE) +
  coord_cartesian(xlim = c(0, 55), ylim = c(0, .5)) + 
  theme_minimal()


predictedValues = model %>% predict(testingData)

plot(predictedValues, testingDataScaled$MonthlyIncome)

That is all great, but let’s see what we would have gotten with a normal linear regression:


trainModel = lm(MonthlyIncome ~ ., data = trainingDataScaled)

lmPredicted = predict(trainModel, testingDataScaled)

plot(lmPredicted, testingDataScaled$MonthlyIncome)

The Black Box…

While getting a clear look into the black box can be tricky, there are ways to help to explain what is going on within any of these machine learning algorithms. Locally Interpretable Model-Agnostic Explanations

The lime package will give us some insight into our models.


library(lime)

model_type.keras.engine.sequential.Sequential = function(x, ...) "regression"

# Setup lime::predict_model()
predict_model.keras.engine.sequential.Sequential  = function(x, newdata, type, ...) {
  pred = predict(object = x, x = as.matrix(newdata))
  data.frame(pred) 
}

predict_model(x = model, newdata = trainingData, type = 'raw')

explainer <- lime(x = as.data.frame(trainingData), model)

explanation = explain(as.data.frame(testingData), explainer, n_features = 2)

Text Classification

Since we are going to be using TensorFlow, we will need to convert our data to tensors. It is a substantial amount of work. Functions (or functional programming) should definitely be used here, but this will just make the steps concrete.


library(dplyr)

load("D:/projects/courses/unstructured/data/zappaOrCountry.RData")

zappaOrCountry = zappaOrCountry %>% 
  mutate(zappaNY = ifelse(genre == "zappa", 1, 0)) %>% 
  select(text, zappaNY)

splits = initial_split(zappaOrCountry, .6, "zappaNY")

trainingDataWhole = training(splits)

testingDataWhole = testing(splits)

trainingLabel = as.vector(trainingDataWhole$zappaNY)

trainingData = as.array(trainingDataWhole[, -c(2)])

testingLabel = as.vector(testingDataWhole$zappaNY)

testingData = as.array(testingDataWhole[, -c(2)])

tokenizerTrain = text_tokenizer(num_words = 50000)

fit_text_tokenizer(tokenizerTrain, trainingData) # Many things are modified in place...

trainingData = texts_to_sequences(tokenizerTrain, trainingData)

tokenizerTest = text_tokenizer(num_words = 50000)

fit_text_tokenizer(tokenizerTest, testingData)

testingData = texts_to_sequences(tokenizerTest, testingData)

trainingData = pad_sequences(trainingData, value = 0,
  padding = "post", maxlen = 400)

testingData = pad_sequences(testingData, value = 0,
  padding = "post", maxlen = 400)

For the love of all that is good, we can start getting to our model now:


vocabSize = 50000

model = keras_model_sequential()

model %>% 
  layer_embedding(input_dim = vocabSize, output_dim = 16) %>%
  layer_global_average_pooling_1d() %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

model %>% summary()

model %>% compile(optimizer = 'adam', loss = 'binary_crossentropy',
  metrics = list('accuracy'))

xValidation = trainingData[1:500, ]

xTraining = trainingData[501:nrow(trainingData), ]

yValidation = trainingLabel[1:500]

yTraining = trainingLabel[501:length(trainingLabel)]

history = model %>% 
  fit(xTraining, yTraining,
      epochs = 40, batch_size = 50,
      validation_data = list(xValidation, yValidation),
      verbose = 3)

We can see that our model generally improved after every epoch (we do see some that our validation tended to drop). The loss is notable here and just refers to the distance of an observation from a prediction – as we approach 0, we are getting closer to perfect prediction.

Model Explanations

For giggles, we can take the same data and run it through a boosted gradient descent classifier.


getMatrix = function(text) {
  tokenized = itoken(text, progressbar = FALSE)
  create_dtm(tokenized, vectorizer = hash_vectorizer())
}

trainingDTM = getMatrix(trainingDataWhole$text)

testingDTM = getMatrix(testingDataWhole$text)

params = list(max_depth = 7, eta = 0.1, 
              objective = "binary:logistic", 
              eval_metric = "error", 
              nthread = 1)

xgbModel = xgb.train(params, xgb.DMatrix(trainingDTM, 
                                          label = trainingDataWhole$zappaNY),
  nrounds = 50)

predictions = predict(xgbModel, testingDTM) > 0.5

test_labels = testingDataWhole$zappaNY

meanAccuracy = mean(predictions == test_labels)

meanAccuracy

[1] 0.9526531

sentenceExplanation = head(testingDataWhole$text[testingDataWhole$zappaNY == 0], 15)

explainerCountry = lime(sentenceExplanation, model = xgbModel, 
                  preprocess = getMatrix)

explanationCountry = explain(sentenceExplanation, explainerCountry, n_labels = 1, 
                       n_features = 2)

explanationCountry[, 2:9]

zappaExplanation = tail(testingDataWhole$text[testingDataWhole$zappaNY == 1], 3)

explainerZappa = lime(zappaExplanation, model = xgbModel, 
                  preprocess = getMatrix)

explanationZappa = explain(zappaExplanation, explainerZappa, n_labels = 1, 
                       n_features = 2)

explanationZappa[, 2:9]

Images

Now we can go back to those images we created earlier. The previous deep net model we ran was just that – a deep net. Since we are dealing with images, now, we can use the convolutional layers.

We will replicate the model diagram that we saw earlier, but we could obviously tweak until we could not see straight.


trainingDirectory = "D:/projects/courses/unstructured/data/kerasImages/train/"

validationDirectory = "D:/projects/courses/unstructured/data/kerasImages/validate/"

imageWidth = 20

imageHeight = 20

targetSize = c(imageWidth, imageHeight)

# RGB = 3 channels

channels = 3

# We can increase our training set by using image generation.
# This will decrease our chances of overfitting with small data.

trainingDataGenerator = image_data_generator(rescale = 1/255, rotation_range = 40,
                                             width_shift_range = 0.2,
                                             height_shift_range = 0.2,
                                             shear_range = 0.2,
                                             zoom_range = 0.2,
                                             horizontal_flip = TRUE)

validationDataGenerator = image_data_generator(rescale = 1/255)  

testDataGenerator = image_data_generator(rescale = 1/255)  

trainingImagesArray = flow_images_from_directory(trainingDirectory, 
                                          trainingDataGenerator,
                                          target_size = targetSize,
                                          class_mode = "categorical",
                                          classes = c("people", "monsters"),
                                          seed = 1001)

# validation images
validationImagesArray = flow_images_from_directory(validationDirectory, 
                                          validationDataGenerator,
                                          target_size = targetSize,
                                          class_mode = "categorical",
                                          classes = c("people", "monsters"),
                                          seed = 1001)

trainSamplesN = trainingImagesArray$n

validateSamplesN = validationImagesArray$n

model = keras_model_sequential()

# add layers
model %>%
  layer_conv_2d(filter = 32, kernel_size = c(3, 3), padding = "same", 
                input_shape = c(imageWidth, imageHeight, channels)) %>%
  layer_activation("relu") %>% 
  
  # Second hidden layer
  layer_conv_2d(filter = 16, kernel_size = c(3, 3), padding = "same") %>%
  layer_activation_leaky_relu(0.5) %>%
  layer_batch_normalization() %>%

  # Use max pooling
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(0.25) %>%
  
  # Flatten max filtered output into feature vector 
  # and feed into dense layer
  layer_flatten() %>%
  layer_dense(100) %>%
  layer_activation("relu") %>%
  layer_dropout(0.5) %>%

  # Outputs from dense layer are projected onto output layer
  layer_dense(2) %>% 
  layer_activation("softmax")

# compile
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(lr = 0.0001, decay = 1e-6),
  metrics = "accuracy")

history = model %>% fit_generator(trainingImagesArray,
  steps_per_epoch = as.integer(trainSamplesN / 32), 
  epochs = 15, validation_data = validationImagesArray,
  validation_steps = as.integer(validateSamplesN / 32),
  verbose = 2)

We can see that we started getting pretty good accuracy towards the end of our epochs.

Pretrained Models

There will be times to train your own models, but sometimes a previously-used model is useful. If you are trying to identify common items, we might use the vgg16 model with imagenet weights. Many of the pretrained models are going to identify common objects (e.g., Common Objects in Context).


plot_superpixels("D:/projects/courses/unstructured/data/kerasImages/train/monsters/monster001.jpg")


testMonster = image_read("D:/projects/courses/unstructured/data/kerasImages/test/monsters/monster007.jpg")

image_write(testMonster, "D:/projects/courses/unstructured/data/kerasImages/test/monsters/monsterTest.jpg")

testPerson = image_read("D:/projects/courses/unstructured/data/kerasImages/train/people/person026.jpg")

image_write(testPerson, "D:/projects/courses/unstructured/data/kerasImages/test/people/personTest.jpg")

testMonster = "D:/projects/courses/unstructured/data/kerasImages/test/monsters/monsterTest.jpg" 

testPerson = "D:/projects/courses/unstructured/data/kerasImages/test/people/personTest.jpg"

model2 = application_vgg16(weights = "imagenet", include_top = TRUE)

model_labels = readRDS(system.file('extdata', 'imagenet_labels.rds', package = 'lime'))

image_prep = function(x) {
  arrays = lapply(x, function(path) {
    img = image_load(path, target_size = c(224,224))
    x = image_to_array(img)
    x = array_reshape(x, c(1, dim(x)))
    x = imagenet_preprocess_input(x)
  })
  do.call(abind::abind, c(arrays, list(along = 1)))
}

res = predict(model2, image_prep(c(testPerson, testMonster)))

imagenet_decode_predictions(res)

[[1]]
  class_name class_description      score
1  n04350905              suit 0.48977816
2  n03838899              oboe 0.13054679
3  n04591157       Windsor_tie 0.06738417
4  n02804610           bassoon 0.06695645
5  n02883205           bow_tie 0.03743355

[[2]]
  class_name class_description      score
1  n07753275         pineapple 0.68614203
2  n07753592            banana 0.05868377
3  n03991062               pot 0.05239671
4  n12768682           buckeye 0.03940406
5  n04522168              vase 0.01267886

Those appear to be interesting pictures.

Let’s check them out.


explainer = lime(c(testPerson, testMonster), 
                 as_classifier(model2, model_labels), image_prep)

explanation = explain(c(testPerson, testMonster), explainer, 
                       n_labels = 2, n_features = 35,
                       n_superpixels = 35, weight = 10,
                       background = "white")

plot_image_explanation(explanation)

plot_image_explanation(explanation[explanation$case == "monsterTest.jpg", ])

Services

Google

Microsoft

Fun Stuff

Style Transfer

Google DeepDream

Julia

Sunrise…sunset. Everything comes and goes, and languages are not any different.

While it is not there yet, there is a reasonable chance that Julia is going to cut into the space that R and Python hold.

While we might not need another tool right now, Julia offers some massive advantages over both R and Python. The biggest is pure speed. While it is still young, be on the look-out for it.